4. User comparison

Table of Contents

  1. Preparation

  2. Functions

  3. Tests

Preparation


In [ ]:
# Run the previous notebook of the series in this kernel. Several names used
# further down (e.g. getUserDataVector, correctAnswers, dataFolderPath) are not
# defined in this notebook and are expected to be provided by this %run chain.
%run "../Functions/3. Per session and per user analysis.ipynb"
print("4. User comparison")

Functions


In [ ]:
def getAllUsers( dataframe ):
    """Return the distinct, non-missing user ids of `dataframe['userId']`.

    Ids are dropped when they are real missing values (NaN/None, detected with
    `pd.isnull`) or one of the placeholder strings 'nan' / 'null'.

    A plain `value in [..., np.nan, ...]` membership test is not sufficient:
    NaN never compares equal to itself, so such a test only matches the very
    same `np.nan` object and lets through the NaNs of a float column.

    :param dataframe: DataFrame with a 'userId' column.
    :return: list of the remaining ids, in the order given by `unique()`.
    """
    missingPlaceholders = ('nan', 'null')
    return [
        userId
        for userId in dataframe['userId'].unique()
        if not (
            pd.isnull(userId)
            # only strings can be placeholders; this also avoids comparing
            # numpy scalars with strings
            or (isinstance(userId, str) and userId in missingPlaceholders)
        )
    ]

In [ ]:
def getAllUserVectorData( userIds, _rmDF, _gfDF, _source = correctAnswers, _printDebug = True, _binary=True):
    """Compute the data vector of every user of `userIds` and gather them side by side.

    Each user is processed by `getUserDataVector`. The per-user results are
    collected in a list and concatenated along the column axis in one
    `pd.concat` call, instead of re-concatenating the growing result at every
    iteration (which copied all previously gathered data each time).

    A progress widget is displayed while users are processed and is closed
    afterwards, including when processing a user raises.

    :param userIds: sized iterable of the ids of the users to process.
    :param _rmDF: forwarded to `getUserDataVector` as `_rmDF`.
    :param _gfDF: forwarded to `getUserDataVector` as `_gfDF`.
    :param _source: correction source forwarded to `getUserDataVector`; pass an
        extended source to also include the answers to additional questions.
    :param _printDebug: forwarded to `getUserDataVector`.
    :param _binary: forwarded to `getUserDataVector`.
    :return: `[]` when `userIds` is empty, the single user's vector as returned
        by `getUserDataVector` when there is exactly one user, otherwise the
        column-wise concatenation of all the vectors.
    """
    progress = FloatProgress(min=0, max=len(userIds))
    display(progress)

    userVectors = []
    try:
        for userId in userIds:
            progress.value += 1
            userVectors.append(
                getUserDataVector(
                    userId,
                    _rmDF = _rmDF,
                    _gfDF = _gfDF,
                    _source = _source,
                    _printDebug = _printDebug,
                    _binary = _binary,
                )
            )
    finally:
        # do not leave a stale widget behind if a user could not be processed
        progress.close()

    # historical return values for the degenerate cases are kept as they were
    if len(userVectors) == 0:
        return []
    if len(userVectors) == 1:
        return userVectors[0]
    return pd.concat(userVectors, axis=1)

In [ ]:
def getAllUserVectorDataCustom(_rmDF, _gfDF, before, after, gfMode = False, rmMode = True, sessionCount = 1):
    """Return the user vectors of a selection of survey respondents, filtered on session count.

    The respondents are selected from `_gfDF` with one of the helpers below:
      - `before` and `after`: `getSurveysOfUsersWhoAnsweredBoth` (receives `gfMode` and `rmMode`);
      - `before` only: `getRMBefores` if `rmMode` else `getGFBefores`;
      - `after` only: `getRMAfters` if `rmMode` else `getGFormAfters`;
      - neither: nobody is selected.

    :param _rmDF: forwarded to `getAllUserVectorData` as `_rmDF`.
    :param _gfDF: survey data handed to the selection helper and forwarded to
        `getAllUserVectorData` as `_gfDF`.
    :param sessionCount: only users whose 'sessionsCount' entry equals this value are kept.
    :return: the vectors of the kept users, one column per user; `[]` (after
        printing "no matching user") when the selection is empty.
    """
    if before and after:
        selectedSurveys = getSurveysOfUsersWhoAnsweredBoth(_gfDF, gfMode = gfMode, rmMode = rmMode)
    elif before:
        selectedSurveys = getRMBefores(_gfDF) if rmMode else getGFBefores(_gfDF)
    elif after:
        selectedSurveys = getRMAfters(_gfDF) if rmMode else getGFormAfters(_gfDF)
    else:
        selectedSurveys = []

    # guard clause: nothing selected
    if len(selectedSurveys) == 0:
        print("no matching user")
        return []

    selectedUserIds = selectedSurveys[localplayerguidkey]
    # transposed so that 'sessionsCount' becomes a column that can be filtered on
    vectorsPerUser = getAllUserVectorData(selectedUserIds, _rmDF = _rmDF, _gfDF = _gfDF).T
    hasRequestedSessionCount = vectorsPerUser['sessionsCount'] == sessionCount
    return vectorsPerUser[hasRequestedSessionCount].T

In [ ]:
# correlation methods accepted by plotAllUserVectorDataCorrelationMatrix
methods = ['pearson', 'kendall', 'spearman']
def plotAllUserVectorDataCorrelationMatrix(
    _allUserVectorData,
    _method = methods[0],
    _title='RedMetrics Correlations',
    _abs=False,
    _clustered=False,
    _figsize = (20,20),
    columnSubset=[]
):
    """Plot the correlation matrix of the columns of `_allUserVectorData`.

    A 4-step progress widget is displayed and advanced along the way.

    :param _allUserVectorData: DataFrame whose columns are correlated; it is
        cast to float first.
    :param _method: one of `methods`; any other value falls back to `methods[0]`.
    :param _title: title of the heatmap; not used when `_clustered` is true.
    :param _abs: plot absolute correlations; the color scale then starts at 0
        instead of -1.
    :param _clustered: draw a seaborn clustermap instead of a heatmap. Rows of
        the correlation matrix that contain only NaNs are removed first, with
        the matching columns, because they cannot be clustered.
    :param _figsize: size of the figure.
    :param columnSubset: columns to restrict the data to. The restriction is
        applied only if the list is non-empty and every entry is an existing
        column; otherwise it is ignored. The list is never modified.
    :return: None.
    """
    _progress = FloatProgress(min=0, max=4)
    display(_progress)

    # computation of subset
    if len(columnSubset) > 0 and pd.Series(columnSubset).isin(_allUserVectorData.columns).all():
        _allUserVectorData = _allUserVectorData.loc[:,columnSubset]

    # computation of correlation matrix
    _m = _method if _method in methods else methods[0]
    _correlation = _allUserVectorData.astype(float).corr(_m)
    _progress.value += 1
    if _abs:
        _correlation = _correlation.abs()
    _progress.value += 1

    # bounds of the color scale
    vmin = 0 if _abs else -1
    vmax = 1

    # plot
    if _clustered:
        # removing NaNs: can't cluster NaN lines in _correlation
        # (same approach as plotCorrelationMatrix in '2. Google form analysis.ipynb')
        # `not` is used rather than `~`: bitwise inversion is only a logical
        # negation for numpy booleans, not for Python ones (~True == -2).
        _notNaNsIndices = [
            index
            for index in _correlation.index
            if not pd.isnull(_correlation.loc[index,:]).all()
        ]
        _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
        _progress.value += 1
        sns.clustermap(
            _correlation,
            cmap=plt.cm.jet,
            square=True,
            figsize=_figsize,
            vmin=vmin,
            vmax=vmax,
        )
    else:
        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        _progress.value += 1
        sns.heatmap(
            _correlation,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            vmin=vmin,
            vmax=vmax,
        )
    _progress.value += 1

In [ ]:
def getPercentageCrossCorrect(binarized, figsize=(40,100)):
    """Draw two annotated heatmaps of cross-correct percentages, side by side.

    Both panels start from `getCrossCorrectAnswers(binarized)`, rounded, cast
    to int and multiplied by 100:
      - left panel: that table divided by the number of rows of `binarized`;
      - right panel: that table divided by the per-column sums of `binarized`
        (sums equal to 0 are replaced by 1 to avoid dividing by zero).
    Values are rounded to integers and shown on a 0-100 color scale.

    :param binarized: table of answers; presumably 0/1 values with one column
        per question - verify against the callers.
    :param figsize: size of the whole figure.
    :return: None; the figure is drawn with matplotlib/seaborn.
    """
    colorbarOptions = dict(orientation= "horizontal")

    def drawPanel(position, title, percentages):
        # one annotated heatmap in the given subplot slot
        axes = plt.subplot(position)
        axes.set_title(title)
        sns.heatmap(
            percentages,
            ax=axes,
            cmap=plt.cm.jet,
            square=True,
            annot=True,
            fmt='d',
            cbar_kws=colorbarOptions,
            vmin=0,
            vmax=100,
        )

    scaledCounts = getCrossCorrectAnswers(binarized).round().astype(int)*100
    overallPercentages = (scaledCounts / binarized.shape[0]).round().astype(int)
    plt.figure(figsize=figsize)
    drawPanel(121, 'percentage correct', overallPercentages)

    # column sums of `binarized`, computed as a dot product with a vector of ones
    columnTotals = np.dot(np.ones(binarized.shape[0]), binarized)
    columnTotals[columnTotals == 0] = 1
    conditionalPercentages = (scaledCounts / columnTotals).round().astype(int).fillna(0)
    drawPanel(122, 'percentage correct, conditionnally: p(y | x)', conditionalPercentages)

    plt.tight_layout()

In [ ]:
def getCompletedRate(_rmdf):
    """Return the share of distinct users of `_rmdf` that have a 'complete' event.

    :param _rmdf: DataFrame with a user id column named by the global `QUserId`
        and a 'type' column.
    :return: number of distinct users having at least one row of type
        'complete', divided by the number of distinct users, as a float.
        NaN when `_rmdf` contains no user at all, since the rate is undefined
        in that case (this used to raise ZeroDivisionError).
    """
    players = _rmdf[QUserId].nunique()
    if players == 0:
        return float('nan')
    completers = _rmdf.loc[_rmdf['type'] == 'complete', QUserId].nunique()
    return float(completers)/float(players)

In [ ]:
# Folders of the cached per-user vector tables, one per encoding; they are the
# two prefixes getAllDataCSVPath chooses between with its `binary` flag.
# NOTE(review): dataFolderPath is not defined in this notebook and is
# concatenated as-is, so it presumably ends with a path separator - confirm.
allBinaryUserVectorDataPath = dataFolderPath + "allBinaryUserVectorData/"
allNumericUserVectorDataPath = dataFolderPath + "allNumericUserVectorData/"

In [ ]:
def getAllDataCSVPath(filePathStem, binary=True):
    """Build the path of the cached CSV file named after `filePathStem`.

    :param filePathStem: file name without folder nor suffix.
    :param binary: truthy to target `allBinaryUserVectorDataPath`, falsy to
        target `allNumericUserVectorDataPath`.
    :return: chosen folder + `filePathStem` + `csvSuffix`.
    """
    folder = allBinaryUserVectorDataPath if binary else allNumericUserVectorDataPath
    return folder + filePathStem + csvSuffix

In [ ]:
def loadAllDataCSV(filePathStem, binary=True):
    """Load a cached table of user vectors as a float64 DataFrame.

    The file is located with `getAllDataCSVPath(filePathStem, binary=binary)`
    and read as strings. When its first column is the header-less one that
    pandas names 'Unnamed: 0', that column becomes the (unnamed) index.
    All remaining values are then converted to float64.

    The index name is cleared by assigning None: `del index.name` raises
    AttributeError on pandas versions where `Index.name` is a property.

    :param filePathStem: file name without folder nor suffix.
    :param binary: forwarded to `getAllDataCSVPath`.
    :return: the loaded DataFrame, with float64 columns.
    """
    currentDF = pd.read_csv(getAllDataCSVPath(filePathStem, binary=binary), dtype=str)

    if currentDF.columns[0] == 'Unnamed: 0':
        currentDF = currentDF.set_index('Unnamed: 0')
        currentDF.index.name = None
    return currentDF.astype(np.float64)

In [ ]:
def saveAllDataCSV(allData, filePathStem, binary=True):
    """Write `allData` to its cache CSV file.

    :param allData: object exposing `to_csv` (e.g. a DataFrame of user vectors).
    :param filePathStem: file name without folder nor suffix.
    :param binary: forwarded to `getAllDataCSVPath` to pick the target folder.
    :return: None; the file is written with the `csvEncoding` encoding.
    """
    targetPath = getAllDataCSVPath(filePathStem, binary=binary)
    allData.to_csv(targetPath, encoding=csvEncoding)

In [ ]:
# Set to True to recompute every table below from the source dataframes and
# overwrite the cached CSV files; leave False to load the cached CSV files.
regenerateData = False

# Source dataframes (RedMetrics, Google form) of each sample, keyed by the file
# stem of its cache. They are wrapped in lambdas so that they are only looked
# up when regenerating: loading the caches does not require them to exist.
_sampleFrameGetters = {
    "PlaytestPhase1PretestPosttestUniqueProfiles": lambda: (
        rmdfPlaytestPhase1PretestPosttestUniqueProfiles,
        gfdfPlaytestPhase1PretestPosttestUniqueProfiles),
    "PlaytestPhase1PretestPosttestUniqueProfilesVolunteers": lambda: (
        rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers,
        gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers),
    "PlaytestPhase2PretestPosttestUniqueProfiles": lambda: (
        rmdfPlaytestPhase2PretestPosttestUniqueProfiles,
        gfdfPlaytestPhase2PretestPosttestUniqueProfiles),
    "PlaytestPhase2PretestPosttestUniqueProfilesVolunteers": lambda: (
        rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
        gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),
}

def _getAllDataOfSample(filePathStem, binary):
    """Return the user vectors of one sample, regenerated or loaded from cache.

    When `regenerateData` is false the table is read with `loadAllDataCSV`.
    Otherwise it is computed with `getAllUserVectorData` for all the responders
    of the sample's Google form dataframe, using
    `correctAnswers + demographicAnswers` as source, then written with
    `saveAllDataCSV` right away and returned.

    :param filePathStem: key of `_sampleFrameGetters`, also used as file stem.
    :param binary: passed as `_binary` to `getAllUserVectorData` and as
        `binary` to the CSV helpers.
    """
    if not regenerateData:
        return loadAllDataCSV(filePathStem, binary=binary)

    rmDF, gfDF = _sampleFrameGetters[filePathStem]()
    allData = getAllUserVectorData(
        getAllResponders(gfDF),
        _rmDF = rmDF,
        _gfDF = gfDF,
        _source = correctAnswers + demographicAnswers,
        _binary = binary )
    saveAllDataCSV(allData, filePathStem, binary=binary)
    return allData

# binary encoding
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles = _getAllDataOfSample("PlaytestPhase1PretestPosttestUniqueProfiles", binary=True)
allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = _getAllDataOfSample("PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=True)
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles = _getAllDataOfSample("PlaytestPhase2PretestPosttestUniqueProfiles", binary=True)
allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = _getAllDataOfSample("PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=True)

# numeric encoding
allNumericDataPlaytestPhase1PretestPosttestUniqueProfiles = _getAllDataOfSample("PlaytestPhase1PretestPosttestUniqueProfiles", binary=False)
allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = _getAllDataOfSample("PlaytestPhase1PretestPosttestUniqueProfilesVolunteers", binary=False)
allNumericDataPlaytestPhase2PretestPosttestUniqueProfiles = _getAllDataOfSample("PlaytestPhase2PretestPosttestUniqueProfiles", binary=False)
allNumericDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = _getAllDataOfSample("PlaytestPhase2PretestPosttestUniqueProfilesVolunteers", binary=False)

In [ ]:
# Default data sets: each un-prefixed allData* name is bound to the matching
# allBinaryData* table (plain assignment, so both names refer to the same
# object, not to a copy).
allDataPlaytestPhase1PretestPosttestUniqueProfiles = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles
allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allDataPlaytestPhase2PretestPosttestUniqueProfiles = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles
allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers